#First pass at audit Validation graphs
library(readr)
library(haven)
library(tidyverse)
library(magrittr)

#Define functions for later use
add_narm <- function(a,b,c,d,e,f){
  a = ifelse(is.na(a)==TRUE,0,a)
  b = ifelse(is.na(b)==TRUE,0,b)
  c = ifelse(is.na(c)==TRUE,0,c)
  d = ifelse(is.na(d)==TRUE,0,d)
  e = ifelse(is.na(e)==TRUE,0,e)
  f = ifelse(is.na(f)==TRUE,0,f)
  a+b+c+d+e+f
}

#function to remove quotations from a name
quotes_remove <- function(x){
  name <- str_split(x,"'")
  name_fixed <- rep(NA,length(x))
  for(i in 1:length(x)){
    name_fixed[i] <- name[[i]][2]
  }
  name_fixed <- str_trim(name_fixed,side="right")
}

sum_narm <- function(x){
  sum(x,na.rm = TRUE)
}

mean_narm <- function(x){
  mean(x,na.rm = TRUE)
}

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
#Create Regression Data
load("Z:/Race_Prediction/Data/NTJ Short Paper/lihtc_17_biFsg.RData")
full_data <- lihtc_bisg

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
#Add in variable for ATR calculations
lihtc_combined <- read_dta("Y:/BIFSG/Validation/StataFiles/combined/combined_bifsg_v.dta")

lihtc_combined$filer <- ifelse(is.na(lihtc_combined$adjgross)==FALSE,1,0)

merge_data <- select(lihtc_combined,filer_ssn,aotc,nrctc,actc,ex_ssa,fica,tot_inc,tot_tax,
					eic,w2_wages,ssa_wages,irmf_wages,w2_tax,irmf_tax,ssa_tax,filer,adjgross,audit)

full_data_merge <- merge(full_data,merge_data,by="filer_ssn",all=FALSE)


#Add spousal data for non-filers
sp_merge <- subset(merge_data,(merge_data$filer_ssn %in% full_data$spouse_ssn) & merge_data$filer==0)
sp_merge <- select(sp_merge,spouse_ssn,sp_income,sp_tax)


full_data_merge <- merge(full_data_merge,sp_merge,by="spouse_ssn",all.x=TRUE,all.y=FALSE)
save <- full_data_merge

data_select <- subset(full_data_merge,is.na(full_data_merge$atr)==FALSE &
                        full_data_merge$atr<=1 &
                        full_data_merge$atr >=-1)


#+++++++++++++++++++++++++++++++++++++++++++++++++++++
#Append the spouse BISG probabilities

spouses <- subset(data_select,is.na(data_select$spouse_ssn)==FALSE & is.na(data_select$l2)==FALSE)
spouses %<>%
  mutate(prob_b1 = sp_prob_b1,
         prob_b2 = sp_prob_b2,
         prob_b3 = sp_prob_b3,
         prob_b4 = sp_prob_b4,
         prob_b5 = sp_prob_b5,
         prob_b6 = sp_prob_b6,
         demographic = sp_demographic,
         ssn = spouse_ssn)%>%
  select(ssn,starts_with("prob_b"),demographic,filer,agi,atr,eitc,audit)

full_data_select <- data_select
full_data_select$ssn <- full_data_select$filer_ssn
spouses <- subset(spouses,!(spouses$ssn %in% full_data_select$ssn))

full_data_select <- select(full_data_select,ssn,starts_with("prob_b"),demographic,filer,agi,atr,eitc,audit)
full_data_select <- rbind(full_data_select,spouses)
full_data_select$prim_filer <- ifelse((full_data_select$ssn %in% spouses$ssn) | full_data_select$filer==0,0,1)

#Eliminate people for whom we do not observe race
full_data_select <- subset(full_data_select,is.na(full_data_select$demographic)==FALSE)

write.csv(full_data_select,"Z:/Race_Prediction/Data/NTJ short paper/reg_data.csv",row.names = FALSE)


#+++++++++++++++++++++++++++++++++++++++++++++++++++++
#Re-load the data (start here if data already created)

reg_data <- read.csv("Z:/Race_Prediction/Data/NTJ short paper/reg_data.csv")
lihtc_bisg <- as.data.frame(subset(reg_data,reg_data$filer==1))

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# First stage validation graphs

#Graphing them all together
white <- subset(lihtc_bisg$prob_b1,lihtc_bisg$demographic==1)
black <- subset(lihtc_bisg$prob_b2,lihtc_bisg$demographic==2)
asian <- subset(lihtc_bisg$prob_b3,lihtc_bisg$demographic==3)
native <- subset(lihtc_bisg$prob_b4,lihtc_bisg$demographic==4)
multir <- subset(lihtc_bisg$prob_b5,lihtc_bisg$demographic==5)
hispanic <- subset(lihtc_bisg$prob_b6,lihtc_bisg$demographic==6)



par(mfrow=c(3,2)) 
par(bg=NA)
hist(white,col="firebrick",main="White",border="white",
     xlab="Probability",ylab="Percent",xlim=c(0,1),freq = FALSE,
     cex.lab=1.5, cex.main=1.8)
hist(black,col="lightskyblue",main="Black",border="white",
     xlab="Probability",ylab="Percent",xlim=c(0,1),freq = FALSE,
     cex.lab=1.5, cex.main=1.8)
hist(asian,col="forestgreen",main="ANHPI",border="white",
     xlab="Probability",ylab="Percent",xlim=c(0,1),freq = FALSE,
     cex.lab=1.5, cex.main=1.8)
hist(native,col="orange",main="Native",border="white",breaks=20,
     xlab="Probability",ylab="Percent",xlim=c(0,1),freq = FALSE,
     cex.lab=1.5, cex.main=1.8)
hist(multir,col="purple",main="Multiracial",border="white",
     xlab="Probability",ylab="Percent",xlim=c(0,1),freq = FALSE,
     cex.lab=1.5, cex.main=1.8)
hist(hispanic,col="gold",main="Hispanic",border="white",
     xlab="Probability",ylab="Percent",xlim=c(0,1),freq = FALSE,
     cex.lab=1.5, cex.main=1.8)


#Calculate median predicted probabilities
white_id <- rep(0,6)
black_id <- rep(0,6)
asian_id <- rep(0,6)
native_id <- rep(0,6)
multir_id <- rep(0,6)
hispanic_id <- rep(0,6)

for(i in 1:6){
  white_id[i] <- round(mean_narm(subset(lihtc_bisg[,(i+1)],lihtc_bisg$demographic==1)),2)
  black_id[i] <- round(mean_narm(subset(lihtc_bisg[,(i+1)],lihtc_bisg$demographic==2)),2)
  asian_id[i] <- round(mean_narm(subset(lihtc_bisg[,(i+1)],lihtc_bisg$demographic==3)),2)
  native_id[i] <- round(mean_narm(subset(lihtc_bisg[,(i+1)],lihtc_bisg$demographic==4)),2)
  multir_id[i] <- round(mean_narm(subset(lihtc_bisg[,(i+1)],lihtc_bisg$demographic==5)),2)
  hispanic_id[i] <- round(mean_narm(subset(lihtc_bisg[,(i+1)],lihtc_bisg$demographic==6)),2)
}

paper_table <- cbind(white_id,
black_id,
asian_id,
native_id,
multir_id,
hispanic_id)

paper_table

#Calculate population totals
true_pop <- rep(0,6)
predict_pop <- rep(0,6)

lihtc_bisg$index <- 1
lihtc_bisg <- subset(lihtc_bisg,is.na(lihtc_bisg$demographic)==FALSE)

for(i in 1:6){
true_pop[i] <- sum(subset(lihtc_bisg$index,lihtc_bisg$demographic==i))
predict_pop[i] <- sum(lihtc_bisg[,i+1])
}

true_pop
predict_pop

#Test
sum(true_pop)
sum(predict_pop)